import pandas as pdimport plotly.express as pximport plotly.io as piofrom pyspark.sql import SparkSessionimport reimport numpy as npimport plotly.graph_objects as gofrom pyspark.sql.functions import col, split, explode, regexp_replace, transform, whenfrom pyspark.sql import functions as Ffrom pyspark.sql.functions import col, monotonically_increasing_idnp.random.seed(42)pio.renderers.default ="notebook"# Initialize Spark Sessionspark = SparkSession.builder.appName("LightcastData").getOrCreate()# Load Datadf = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("./data/lightcast_job_postings.csv")df.createOrReplaceTempView("job_postings")# Show Schema and Sample Data#print("---This is Diagnostic check, No need to print it in the final doc---")#df.printSchema() # comment this line when rendering the submission#df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/17 21:33:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 1:> (0 + 1) / 1] 25/10/17 21:33:22 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
2 Data Cleaning
from pyspark.sql.functions import coldf = df.withColumn("SALARY", col("SALARY").cast("float"))df = df.withColumn("SALARY_FROM", col("SALARY_FROM").cast("float"))df = df.withColumn("SALARY_TO", col("SALARY_TO").cast("float"))df = df.withColumn("MIN_YEARS_EXPERIENCE", col("MIN_YEARS_EXPERIENCE").cast("float"))df = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))# Compute median salarymedian_from = df.approxQuantile("SALARY_FROM", [0.5], 0.01)[0]median_to = df.approxQuantile("SALARY_TO", [0.5], 0.01)[0]median_salary = df.approxQuantile("SALARY", [0.5], 0.01)[0]print("Medians:",median_from, median_to, median_salary)# Impute missing 'SALARY_FROM' and 'SALARY_TO' with their mediansdf = df.fillna({"SALARY_FROM": median_from,"SALARY_TO": median_to,"SALARY": median_salary})# Compute 'AVERAGE_SALARY'df = df.withColumn("AVERAGE_SALARY", (col("SALARY_FROM") + col("SALARY_TO")) /2)# Impute missing 'SALARY' with AVERAGE_SALARY, and if that's missing, with the median salaryfrom pyspark.sql.functions import whendf = df.withColumn("SALARY", when( col("SALARY").isNull(), when(col("AVERAGE_SALARY").isNotNull(), col("AVERAGE_SALARY")) .otherwise(median_salary) ).otherwise(col("SALARY")))from pyspark.sql.functions import regexp_replacedf = df.withColumn("EDUCATION_LEVELS_NAME", regexp_replace(col("EDUCATION_LEVELS_NAME"), r'[\n\r]', ''))# Overwritedf.write.option("header", True).mode("overwrite").csv("data/lightcast_job_postings_cleaned.csv")# Display row countprint(f"Rows retained after cleaning: {df.count()}")
Salaries vary widely between industries, with sectors like Information and Finance & Insurance generally showing higher salary ranges than industries such as Accommodation and Food Services. Full-time positions tend to have higher median salaries across most industries compared to part-time or other employment types.
4 Salary Analysis by ONET Occupation Type (Bubble Chart)
# Lot Occupation Namesalary_analysis = spark.sql(""" SELECT LOT_OCCUPATION_NAME AS OCCUPATION_NAME, PERCENTILE(SALARY, 0.5) AS Median_Salary, COUNT(*) AS Job_Postings FROM job_postings GROUP BY LOT_OCCUPATION_NAME ORDER BY Job_Postings DESC LIMIT 10""")salary_pd = salary_analysis.toPandas()salary_pd.head()import plotly.express as pxfig = px.scatter( salary_pd, x="OCCUPATION_NAME", y="Median_Salary", size="Job_Postings", title="Salary Analysis by LOT Occupation Type (Bubble Chart)", labels={"OCCUPATION_NAME": "LOT Occupation","Median_Salary": "Median Salary","Job_Postings": "Number of Job Postings" }, hover_name="OCCUPATION_NAME", size_max=60, width=1000, height=600, color="Job_Postings", color_continuous_scale="Viridis",)# Layout Customizationfig.update_layout( font_family="Arial", font_size=14, title_font_size=25, xaxis_title="LOT Occupation", yaxis_title="Median Salary", plot_bgcolor="#f6f9fa", xaxis=dict( tickangle=-60, showline=True, linecolor="#444" ), yaxis=dict( showline=True, linecolor="#444" ), xaxis_title_font=dict(size=17), yaxis_title_font=dict(size=17),)fig.show()
[Stage 10:> (0 + 1) / 1]
5 Salary by Education Level
# Education levelslower_deg = ["Bachelor", "Associate", "GED", "No Education Listed", "High school"]higher_deg = ["Master", "PhD", "Doctorate", "professional degree"]# Add EDU_GROUP columndf = df.withColumn("EDU_GROUP", when( col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in lower_deg])),"Bachelor's or lower" ).when( col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in higher_deg])),"Master's or PhD" ).otherwise("Other"))# Cast columnsdf = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))df = df.withColumn("Average_Salary", col("Average_Salary").cast("float"))# Filter for non-null and positive valuesdf = df.filter( (col("MAX_YEARS_EXPERIENCE").isNotNull()) & (col("Average_Salary").isNotNull()) & (col("MAX_YEARS_EXPERIENCE") >0) & (col("Average_Salary") >0))# Filter for education groupsdf_filtered = df.filter( col("EDU_GROUP").isin("Bachelor's or lower", "Master's or PhD"))# Convert to Pandasdf_pd = df_filtered.toPandas()df_pd.head()# Scatter plot: Experience vs. Salary by Education Groupimport plotly.express as px# Plotfig1 = px.scatter( df_pd, x="MAX_YEARS_EXPERIENCE", y="Average_Salary", color="EDU_GROUP", hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"], title="<b>Experience vs Salary by Education Level</b>", opacity=0.7, color_discrete_sequence=["#36B37E", "#A259EC"] # Custom green & purple)# Add bordersfig1.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))# Update layoutfig1.update_layout( plot_bgcolor="#f9f9f9", paper_bgcolor="#EAF7FF", # Softer blue background font=dict(family="Segoe UI", size=14), title_font=dict(size=22), xaxis_title="Years of Experience", yaxis_title="Average Salary (USD)", legend_title="Education Group", hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"), margin=dict(t=70, b=60, l=60, r=60), xaxis=dict( gridcolor="lightgrey", tickmode="linear", dtick=1, zeroline=False ), yaxis=dict( gridcolor="lightgrey", zeroline=False ))fig1.show()